# Keep things nice and tidy, all libraries go here
library(readxl)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(svglite)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggsci)
# Load the review spreadsheet (skipping its first row) and keep only the
# rows whose Exclude cell is empty.
data <- read_excel("data/data_IEEE.xlsx", skip = 1) %>%
  filter(is.na(Exclude))
## New names:
## * `` -> ...35
# Years without any publication (for easy slicing)
years_no_publications <- as.character(c(1974, 1975, 1976, 1978))

# LABELS so slicing will not become a mess
# Short codes of the SWEBoK areas; the last three (CF, MF, EF) are the ones
# left out of swebok_areas_labels_no_foundation.
swebok_areas_labels <- c(
  "SR", "SD", "SC", "ST", "SM", "SCM",
  "SEM", "SEP", "SEMM", "SQ", "SEPP", "SEE",
  "CF", "MF", "EF"
)

# Same short codes as swebok_areas_labels, without CF, MF and EF.
swebok_areas_labels_no_foundation <- c(
  "SR", "SD", "SC", "ST", "SM", "SCM",
  "SEM", "SEP", "SEMM", "SQ", "SEPP", "SEE"
)

# Human-readable axis labels, one per entry of
# swebok_areas_labels_no_foundation and in the same order (they are applied
# positionally by scale_x_discrete()).
swebok_areas_labels_long <- c(
  "Requirements",
  "Design",
  "Construction",
  "Testing",
  "Maintenance",
  "Config. Mgmt.",
  "SE Mgmt.",
  "SE Processes",
  "SE Models&Methods",
  "Software Quality",
  "SE Prof. Practice",
  "SE Economics"
)

# Column names of the cognitive taxonomy entries, listed family by family.
cognitive_concepts_labels <- c(
  # attention
  "Attention", "Selective attention", "Divided attention",
  "Sustained attention",
  # memory
  "Memory", "Working memory", "Short-term memory", "Long-term memory",
  # cognitive load ("Cognitive control" is deliberately left out)
  "Cognitive load", "Intrinsic CL", "Extrinsic CL",
  # perception
  "Perception",
  # reasoning
  "Problem solving", "Reasoning", "Decision making",
  # biases
  "Cognitive biases",
  # knowledge
  "Knowledge", "Explicit knowledge", "Tacit knowledge",
  "Techn. tacit knowl.", "Cogn. tacit knowl."
)

# Column names of the assessment procedures: the five qualitative ones
# first, then the five quantitative ones.
measures_labels <- c(
  "Qualit. measures", "Fieldwork", "Interview", "Task-based",
  "Open observation",
  "Quantit. measures", "Task performance", "Physiological meas.",
  "Subjective ratings", "Behavioral meas."
)

# COLORS
# Nine-colour qualitative palette given as plain #RRGGBB hex codes.
tol9qualitative <- c(
  "#332288", "#88CCEE", "#44AA99",
  "#117733", "#999933", "#DDCC77",
  "#CC6677", "#882255", "#AA4499"
)

# Nine-colour palette; every entry except the first carries an explicit
# alpha channel (#RRGGBBAA).
NPG_modified <- c(
  "#F5E144", "#4DBBD5FF", "#00A087FF",
  "#3C5488FF", "#F39B7FFF", "#8491B4FF",
  "#91D1C2FF", "#DC0000FF", "#7E6148FF"
)

# Necessary for grouping by high-level category.
#
# Adds a `Concept` column that maps every value of the `Taxonomy` column to
# the high-level cognitive concept it belongs to. Values that appear in none
# of the lists below get NA.
#
# data: a data frame with a `Taxonomy` column.
# Returns `data` with the extra (or overwritten) `Concept` column.
add_high_level_concepts_to_data <- function(data) {
  mutate(
    data,
    Concept = case_when(
      Taxonomy %in% c("Attention", "Selective attention",
                      "Divided attention", "Sustained attention") ~ "Attention",
      Taxonomy %in% c("Memory", "Working memory",
                      "Short-term memory", "Long-term memory") ~ "Memory",
      Taxonomy %in% c("Cognitive control", "Cognitive load",
                      "Extrinsic CL", "Intrinsic CL") ~ "Cognitive load",
      Taxonomy %in% "Perception" ~ "Perception",
      Taxonomy %in% c("Problem solving", "Reasoning",
                      "Decision making") ~ "Reasoning",
      Taxonomy == "Cognitive biases" ~ "Cognitive biases",
      Taxonomy %in% c("Knowledge", "Explicit knowledge", "Tacit knowledge",
                      "Techn. tacit knowl.", "Cogn. tacit knowl.") ~ "Knowledge"
    )
  )
}

Visualizing number of publications over time

# Bar chart of the number of publications per year, with each bar's count
# printed in white inside the bar.
ggplot(data, aes(x = as.factor(Year))) +
  geom_bar() +
  ylab("Number of publications") +
  xlab("Year") +
  # after_stat(count) replaces the deprecated ..count.. notation
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = 2, color = "white", size = 2.5) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# No plot argument: ggsave() writes the plot displayed last.
ggsave("PDFs/yearly_distribution.pdf")
## Saving 7 x 5 in image
# Treat "?" in the Academia and Industry columns as missing, then derive the
# publication Type from which of the two columns is set.
data <- data %>%
  mutate(
    Academia = replace(Academia, Academia == "?", NA),
    Industry = replace(Industry, Industry == "?", NA),
    Type = case_when(
      is.na(Academia) & is.na(Industry) ~ "None",
      Academia == "1" & is.na(Industry) ~ "Academia",
      Industry == "1" & is.na(Academia) ~ "Industry",
      # every remaining combination ends up here
      TRUE ~ "Both"
    )
  )

Number of publications according to their type

# Bar chart of the number of publications per Type, most frequent type
# first, with each bar's count printed in white inside the bar.
data %>%
  mutate(Type = fct_infreq(Type, ordered = TRUE)) %>%
  ggplot(aes(x = Type)) +
  geom_bar(width = .5) +
  xlab("Type of publication") +
  ylab("Number of publications") +
  # after_stat(count) replaces the deprecated ..count.. notation
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = 3, color = "white", size = 4) +
  theme_bw()

# No plot argument: ggsave() writes the plot displayed last.
ggsave("PDFs/academia_industry_distribution.pdf")
## Saving 7 x 5 in image

Number of publications categorized according to SWEBoK Areas.

A publication can be in more than one category at the same time.

# Column totals of the SWEBoK area columns (missing cells count as 0),
# drawn as bars in decreasing order with the total printed above each bar.
data %>%
  select(all_of(swebok_areas_labels)) %>%
  mutate_all(replace_na, 0) %>%
  summarise_all(sum) %>%
  gather(key = "SWEBOKArea", value = "publications", everything()) %>%
  arrange(desc(publications)) %>%
  # freeze the sorted order so the x axis follows it
  mutate(SWEBOKArea = fct_inorder(SWEBOKArea)) %>%
  ggplot(aes(x = SWEBOKArea, y = publications)) +
  geom_col() +
  geom_text(aes(label = publications), vjust = -0.3, color = "black", size = 4) +
  labs(x = "SWEBoK Area", y = "Number of publications") +
  theme_bw()

ggsave("PDFs/swebok_distribution.pdf")
## Saving 7 x 5 in image

Co-occurrences of SWEBoK Areas

# Cross-product of the SWEBoK area columns (missing cells set to 0): entry
# [i, j] is the sum over publications of column i times column j, i.e. a
# co-occurrence count if the columns are 0/1 indicators (presumably so;
# verify against the spreadsheet).
swebokareas <- crossprod(
  data %>%
    select(all_of(swebok_areas_labels)) %>%
    mutate_all(replace_na, 0) %>%
    as.matrix()
)

kable(swebokareas)
SR SD SC ST SM SCM SEM SEP SEMM SQ SEPP SEE CF MF EF
SR 49 18 5 2 4 0 7 2 4 0 7 0 0 0 1
SD 18 66 17 3 4 0 6 2 6 1 6 0 0 0 1
SC 5 17 77 5 22 1 3 2 2 0 3 0 0 0 0
ST 2 3 5 12 4 0 1 0 0 0 0 0 0 0 0
SM 4 4 22 4 46 1 2 1 0 0 1 0 0 0 0
SCM 0 0 1 0 1 2 0 1 0 0 0 0 0 0 0
SEM 7 6 3 1 2 0 26 3 1 0 7 3 0 0 1
SEP 2 2 2 0 1 1 3 10 0 0 2 1 0 0 0
SEMM 4 6 2 0 0 0 1 0 8 0 1 0 0 0 0
SQ 0 1 0 0 0 0 0 0 0 6 0 0 0 0 0
SEPP 7 6 3 0 1 0 7 2 1 0 18 3 0 0 1
SEE 0 0 0 0 0 0 3 1 0 0 3 5 0 0 0
CF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
MF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
EF 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1
plot_ly(x = swebok_areas_labels, y = swebok_areas_labels, z = swebokareas, type = "heatmap")
# For every SWEBoK area, sum each cognitive-taxonomy column over the
# publications tagged with that area.
x <- data %>%
  select(all_of(swebok_areas_labels), all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # this column needs an explicit numeric conversion before it can be summed
  mutate(`Problem solving` = as.numeric(`Problem solving`)) %>%
  # all_of() makes the use of an external character vector unambiguous
  gather(key = "SWEBOK", value = pubs, all_of(swebok_areas_labels)) %>% # use SWEBOK area as factor
  filter(pubs > 0) %>% # select areas for which there are publications
  group_by(SWEBOK) %>%
  summarise_all(sum) %>% # per-area sum of every remaining column
  select(-pubs) %>% # remove pubs to reuse it later
  gather(key = "Taxonomy", value = "count", all_of(cognitive_concepts_labels)) %>% # one row per (area, taxonomy entry)
  mutate(label = str_replace(as.character(count), "^0", "")) # text label; a leading "0" is stripped, so zero counts show nothing
## Warning: NAs introduced by coercion
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(swebok_areas_labels)` instead of `swebok_areas_labels` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cognitive_concepts_labels)` instead of `cognitive_concepts_labels` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Bubble plot
# Sort by Taxonomy and freeze that order as the factor levels.
x <- arrange(x, Taxonomy)
xf <- x$Taxonomy
xfu <- unique(xf)
x$Taxonomy <- factor(xf, levels = xfu)

p <- ggplot(x)
p +
  geom_point(aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), size = count), shape = 21, fill = "white", alpha = 0.60) +
  geom_text(aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), label = label), size = 2) +
  labs(x = "SWEBOK Area", y = "Taxonomy Area") +
  # theme_bw() is a complete theme and replaces every earlier theme setting,
  # so it has to come BEFORE the individual theme() tweaks, not after them.
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1.1, size = 9, colour = "black"),
    axis.text.y = element_text(size = 8, colour = "black"),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10, colour = "black", vjust = 0.12),
    panel.grid.major = element_line(linetype = "dashed", size = 0.1, color = "black")
  )
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).

ggsave("PDFs/swebok_taxonomy_bubble.pdf")
## Saving 7 x 5 in image
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_text).
# Preparing the dataset for analysing the research methods.
# Quantitative / Qualitative are 1 when at least one of the corresponding
# measure columns equals 1 and 0 otherwise (including when nothing is set);
# Both is 1 only when both flags are 1.
data <- data %>%
  mutate(
    Quantitative = if_else(
      `Quantit. measures` == 1 |
        `Task performance` == 1 |
        `Physiological meas.` == 1 |
        `Subjective ratings` == 1 |
        `Behavioral meas.` == 1,
      1, 0,
      missing = 0
    ),
    Qualitative = if_else(
      Fieldwork == 1 |
        Interview == 1 |
        `Qualit. measures` == 1 |
        `Task-based` == 1 |
        `Open observation` == 1,
      1, 0,
      missing = 0
    ),
    Both = if_else(Qualitative == 1 & Quantitative == 1, 1, 0)
  )

The graphs below were prepared for the IEEE Software submission.

Number of publications per year according to SWEBOK areas

# Per-year totals of every SWEBoK area column (missing cells count as 0),
# drawn as bars filled by area.
data %>%
  filter(is.na(Exclude)) %>%
  select(Year, SR:EF) %>%
  gather("SWEBOK", "publications", 2:16) %>%
  mutate_all(replace_na, 0) %>%
  group_by(Year, SWEBOK) %>%
  summarise(total = sum(publications)) %>%
  ggplot(aes(x = as.factor(Year), fill = SWEBOK, y = total)) +
  geom_bar(stat = "sum") +
  labs(x = "Year", y = "Publications") +
  scale_fill_discrete(name = "SWEBOK Areas") +
  # hide the size legend
  guides(size = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

ggsave("PDFs/years_swebok.pdf")
## Saving 7 x 5 in image

Evolution of research methods over the years

# Classify every publication by research method and plot the yearly counts.
#
# NOTE(review): rows where neither Qualitative nor Quantitative is 1 fall
# through to "Quantitative" -- confirm that this is intended.
# NOTE(review): the rows complete() adds for years missing from the data have
# NA flags, get an NA research_method and are removed again by the filter
# below -- confirm whether those years were meant to stay.
data <- data %>%
  complete(Year = seq(1973, 2016)) %>%
  mutate(
    research_method = case_when(
      is.na(Both) ~ NA_character_,
      Both == 1 ~ "Mixed",
      is.na(Qualitative) ~ NA_character_,
      Qualitative == 1 ~ "Qualitative",
      TRUE ~ "Quantitative"
    )
  ) %>%
  filter(!is.na(research_method))

ggplot(data, aes(x = as.factor(Year), fill = research_method)) +
  geom_bar() +
  scale_fill_discrete(name = "Research method", labels = c("Mixed", "Qualitative", "Quantitative", "")) +
  labs(x = "Year", y = "Publications") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

ggsave("PDFs/years_researchmethods.pdf")
## Saving 7 x 5 in image

Prevalence of research methods in the SWEBOK areas

# Per research method, the column total of every SWEBoK area column
# (missing cells count as 0), in long format.
data.swebok.researchmethod <- data %>%
  # all_of() makes the use of an external character vector unambiguous
  select(all_of(swebok_areas_labels), research_method) %>%
  mutate_all(replace_na, 0) %>%
  filter(research_method != 0) %>% # drop rows whose missing method was turned into 0 above
  group_by(research_method) %>%
  summarise_at(swebok_areas_labels, sum) %>% # .vars accepts a character vector of column names
  gather("SWEBOK", "Publications", all_of(swebok_areas_labels))

# Horizontal stacked bars, areas ordered by their total over all methods.
data.swebok.researchmethod %>%
  ggplot(aes(x = reorder(SWEBOK, Publications, sum), y = Publications, fill = research_method)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("SWEBOK areas") +
  scale_fill_discrete(name = "Research method")

ggsave("PDFs/SWEBOK_researchmethods.pdf")
## Saving 7 x 5 in image

Distribution of publications across cognitive concepts and cognitive assessment procedures

# Bubble chart: for every (taxonomy entry, assessment procedure) pair, the
# number of identified publications with a non-missing cell in both columns.
data %>%
  filter(!is.na(Identifier)) %>%
  # all_of() makes the use of external character vectors unambiguous
  select(Identifier, all_of(cognitive_concepts_labels), all_of(measures_labels)) %>%
  gather(Taxonomy, value, all_of(cognitive_concepts_labels)) %>%
  filter(!is.na(value)) %>%
  select(-value) %>%
  gather(Method, value, all_of(measures_labels)) %>%
  filter(!is.na(value)) %>%
  arrange(Identifier) %>%
  select(-Identifier, -value) %>%
  group_by(Taxonomy, Method) %>%
  tally(name = "Amount") %>%
  ggplot(aes(x = Method, y = Taxonomy, fill = Amount)) +
  geom_point(aes(size = Amount), alpha = 0.5) +
  xlab("Cognitive Assessment Procedures") +
  ylab("Cognitive Concepts") +
  # "none" is the documented value for hiding the legend
  theme(legend.position = "none", axis.text.x = element_text(angle = 30, hjust = 1, size = 8))

ggsave("PDFs/taxonomy_methods.pdf")
## Saving 7 x 5 in image
# Heat map of SWEBoK areas (CF, MF, EF excluded) against high-level
# cognitive concepts: each tile shows how many (publication, taxonomy entry)
# pairs have a 1 in both the area column and the taxonomy column.
data %>%
  select(all_of(swebok_areas_labels_no_foundation), all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # all_of() makes the use of external character vectors unambiguous
  gather(Taxonomy, value2, all_of(cognitive_concepts_labels)) %>%
  add_high_level_concepts_to_data() %>%
  gather(SWEBOK, value, all_of(swebok_areas_labels_no_foundation)) %>%
  count(SWEBOK, Concept, value, value2) %>%
  mutate(freq = ifelse(value == 1 & value2 == 1, n, 0)) %>% # only the 1/1 combination counts
  distinct(SWEBOK, Concept, freq) %>%
  group_by(SWEBOK, Concept) %>%
  summarize(total = sum(freq)) %>%
  ungroup() %>%
  ggplot(aes(fct_relevel(SWEBOK, swebok_areas_labels_no_foundation), fct_rev(Concept), fill = total)) +
  geom_tile() +
  scale_fill_continuous(low = "#fff9f7", high = "red") +
  xlab("SWEBOK areas") +
  ylab("Cognitive Concepts") +
  guides(fill = guide_legend(title = "")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8)) +
  # labels are positional: they rely on the fct_relevel() order above
  scale_x_discrete(labels = swebok_areas_labels_long)

ggsave("PDFs/taxomony_swebok_cooccurences.pdf")
## Saving 7 x 5 in image
# Heat map of high-level cognitive concepts against assessment procedures:
# each tile shows how many (publication, taxonomy entry) pairs have a 1 in
# both the taxonomy column and the procedure column.
data %>%
  # all_of() makes the use of external character vectors unambiguous
  select(all_of(cognitive_concepts_labels), all_of(measures_labels)) %>%
  mutate_all(replace_na, 0) %>%
  gather(Taxonomy, value, all_of(cognitive_concepts_labels)) %>%
  add_high_level_concepts_to_data() %>%
  gather(Method, value2, all_of(measures_labels)) %>%
  count(Concept, Method, value, value2) %>%
  mutate(freq = ifelse(value == 1 & value2 == 1, n, 0)) %>%
  # One row per tile: without this, several rows are drawn on top of each
  # other and the visible colour depends on row order.
  group_by(Concept, Method) %>%
  summarise(freq = sum(freq)) %>%
  ungroup() %>%
  ggplot(aes(
    # Explicit column order: the four specific procedures of each family,
    # followed by that family's generic column (shown as "Others").
    fct_relevel(
      Method,
      "Fieldwork", "Interview", "Task-based", "Open observation",
      "Qualit. measures",
      "Task performance", "Physiological meas.", "Subjective ratings",
      "Behavioral meas.",
      "Quantit. measures"
    ),
    fct_rev(Concept),
    fill = freq
  )) +
  geom_tile() +
  geom_vline(xintercept = 5.5, size = 0.5, color = "darkgrey") + # separates qualitative from quantitative columns
  xlab("Cognitive Assessment Procedures") +
  ylab("Cognitive Concepts") +
  guides(fill = guide_legend(title = "")) +
  # Named labels are matched by level, so only the two generic columns are
  # renamed; a positional vector was previously shifted by one against the
  # measures_labels order and mislabelled every column.
  scale_x_discrete(labels = c("Qualit. measures" = "Others", "Quantit. measures" = "Others")) +
  annotate(geom = "text", x = 8, y = 0.73, label = "Quantitative", size = 3, alpha = 0.4) +
  annotate(geom = "text", x = 3, y = 0.73, label = "Qualitative", size = 3, alpha = 0.4) +
  scale_fill_continuous(low = "#fff9f7", high = "darkgreen") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))

ggsave("PDFs/taxonomy_method_cooccurences.pdf")
## Saving 7 x 5 in image
# Per-year totals of every cognitive-taxonomy column (missing cells count
# as 0), drawn as bars filled by taxonomy entry.
data %>%
  # all_of() makes the use of an external character vector unambiguous
  select(Year, all_of(cognitive_concepts_labels)) %>%
  gather("Taxonomy", "publications", all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  mutate(publications = as.integer(publications)) %>%
  group_by(Year, Taxonomy) %>%
  summarise(total = sum(publications)) %>%
  ggplot(aes(as.factor(Year), total, fill = Taxonomy)) +
  geom_bar(stat = "sum") +
  xlab("Year") +
  ylab("Publications") +
  scale_fill_discrete(name = "Taxonomy Areas") +
  guides(size = FALSE) + # hide the size legend
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
## Warning: NAs introduced by coercion
## Warning: Removed 1 rows containing non-finite values (stat_sum).

 df.taxonomy <- data %>%
  # Long table with one row per (publication, taxonomy entry) whose cell is
  # greater than 0.
  select(Year, all_of(cognitive_concepts_labels)) %>%
  # all_of() makes the use of an external character vector unambiguous
  gather("Taxonomy", "publications", all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  mutate(publications = as.integer(publications)) %>% # for some reason recognized as char
  filter(publications > 0)
## Warning: NAs introduced by coercion
# need to create a separate df to hold the percentage of publications within each year
data.percentage <- df.taxonomy %>%
  group_by(Year) %>%
  count(Taxonomy) %>%
  # still grouped by Year here, so sum(n) is the per-year total
  mutate(ratio = scales::percent(n / sum(n)))

# 100% stacked bars per year, split by taxonomy entry, with the share of
# each segment printed inside it.
df.taxonomy %>%
  ggplot(aes(x = as.factor(Year), fill = as.factor(Taxonomy))) +
  geom_bar(position = "fill") +
  geom_text(data = data.percentage, aes(y = n, label = ratio), position = position_fill(vjust = 0.5), colour = "white", size = 1.3) +
  xlab("Year") +
  ylab("Publications %") +
  scale_fill_discrete(name = "Topic") +
  guides(size = FALSE) +
  scale_y_continuous(labels = percent) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  theme(legend.key.size = unit(.2, "cm"), legend.key.width = unit(0.2, "cm"), legend.title = element_text(size = 8), legend.text = element_text(size = 6))

# ggsave() expects plain numbers for width/height; the unit goes in `units`.
ggsave("PDFs/taxonomy_years.pdf", width = 10, height = 6.5, units = "in")
# Taxonomy rows tagged with their high-level concept.
df.concepts <- df.taxonomy %>%
  add_high_level_concepts_to_data()
# Number of rows of `data` per year, leaving out the years listed in
# years_no_publications.
df.years <- data %>%
  filter(!(Year %in% years_no_publications)) %>%
  count(Year)

# 100% stacked bars of concepts per year, overlaid with a line and points
# for the yearly row count scaled to [0, 1]; the secondary axis scales it
# back to absolute numbers.
ggplot() +
  geom_bar(data = df.concepts, aes(x = as.factor(Year), fill = Concept), position = "fill") +
  geom_line(data = df.years, aes(x = as.factor(Year), y = n / max(n), group = 1), size = 0.8) +
  geom_point(data = df.years, aes(x = as.factor(Year), y = n / max(n), group = 1)) +
  scale_y_continuous(
    labels = function(x) x * 100,
    name = "Publication %",
    sec.axis = sec_axis(~ . * max(df.years$n), name = "Total publications", breaks = scales::breaks_extended(10))
  ) +
  xlab("Year") +
  theme(
    panel.background = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_text(margin = margin(-15, 0, 0, 0, "pt")),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, vjust = 2.4)
  ) +
  scale_fill_manual(values = NPG_modified)

# ggsave() expects plain numbers for width/height; the unit goes in `units`.
ggsave("PDFs/concepts_years.pdf", width = 13, height = 6.5, units = "in")